Podsumowanie

1. Wyliczanie bibliotek

usePackage <- function(p) {
    if (!is.element(p, installed.packages()[,1]))
        install.packages(p, dep = TRUE, verbose = FALSE)
    require(p, character.only = TRUE, quietly = TRUE)
}

packageList <- c("dplyr", "matrixStats", "tidyr", "caret", "ggcorrplot", "plotly")

print("Użyte biblioteki to:")
## [1] "Uzyte biblioteki to:"
packageList
## [1] "dplyr"       "matrixStats" "tidyr"       "caret"       "ggcorrplot" 
## [6] "plotly"
sapply(packageList, usePackage)
##       dplyr matrixStats       tidyr       caret  ggcorrplot      plotly 
##        TRUE        TRUE        TRUE        TRUE        TRUE        TRUE

2. Zapewnianie powtarzalności obliczeń

setConsistency <- function() {
  set.seed(42)
}

3. Ładowanie danych

loadData <- function(urlPath, localFile = 'life_expectancy.tsv') {
  remote_data_file <- download.file(urlPath, destfile = localFile, quiet= TRUE)
  return(read.csv(TMP_LOCAL_FILE, quote='"'))
}
FILE_URL <- 'http://www.cs.put.poznan.pl/alabijak/emd/projekt/Life_Expectancy_Data.csv'
TMP_LOCAL_FILE <- 'life_expectancy.csv'

data <- loadData(FILE_URL, TMP_LOCAL_FILE)

4. Przetwarzanie brakujących danych

fixData <- function(df) {
  # Rename data
  fixedData <- rename(df,
     Life.Expectancy = Life.expectancy,
     Infant.Deaths = infant.deaths, 
     Percentage.Expenditure = percentage.expenditure,
     Under.Five.Deaths = under.five.deaths,
     Total.Expenditure = Total.expenditure,
     Thinness.Years.10.19 = thinness..1.19.years,
     Thinness.Years.5.9 = thinness.5.9.years,
     Income.Resources.Composition = Income.composition.of.resources)
  
  fixedData <- within(fixedData, Alcohol[is.na(Alcohol) & Year == 2015] <- 0)
  fixedData <- within(fixedData, Total.Expenditure[is.na(Total.Expenditure) & Year == 2015] <- 0)

  # fixedData$Country <- as.factor(fixedData$Country)
  # fixedData$Status <- as.factor(fixedData$Status)
  
  return(fixedData)
}

cleanData <- function (df) {
  return(na.omit(df))
}
  
removeChrColumns <- function(df) {
  library(dplyr)
  return(df %>% 
    select_if(~!is.character(.)))
}

fixedData <- fixData(data)
cleanedData <- cleanData(fixedData)
numericData <- removeChrColumns(cleanedData)

5. Rozmiar zbioru i podstawowe statystyki

print(paste('Zbiór danych zawiera', nrow(cleanedData), 'przykladów.'))
## [1] "Zbiór danych zawiera 1777 przykladów."
print(paste('Każdy z przykladow posiada', ncol(cleanedData), 'atrybutów.'))
## [1] "Kazdy z przykladow posiada 22 atrybutów."
library(dplyr)
library(tidyr)

cleanedData %>%
  tibble::as_tibble() %>% 
  select(Life.Expectancy, Adult.Mortality, Infant.Deaths, Population) %>%
  summarise(across(everything(), list(mean = mean, median = median, min = min, max = max), .names = "{.col}_{.fn}")) %>%
  gather(variable, value) %>%
  separate(variable, c("var", "funkcja"), sep = "\\_") %>%
  spread(var, value) %>%
  relocate(Life.Expectancy, .after = funkcja)
## # A tibble: 4 x 5
##   funkcja Life.Expectancy Adult.Mortality Infant.Deaths  Population
##   <chr>             <dbl>           <dbl>         <dbl>       <dbl>
## 1 max                89              723         1600   1293859294 
## 2 mean               69.4            168.          32.2   14430006.
## 3 median             71.7            148            3      1435568 
## 4 min                44                1            0           34

6. Szczegółowa analiza wartości atrybutów

## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

7. Korelacje między zmiennymi

library(ggcorrplot)

corr <- round(cor(numericData), 1)
p.mat <- cor_pmat(numericData)

ggcorrplot(corr, p.mat = p.mat, hc.order=TRUE, type='full', tl.cex=10, tl.srt=60)

8. Długość życia w zależności od krajów

library(plotly)

countryNames <- unique(cleanedData$Country)
countryData <- cleanedData %>%
  filter(Life.Expectancy != 0) %>%
  group_by(Country) %>%
  dplyr::summarize(Mean.Life.Expectancy = mean(Life.Expectancy), .groups = 'drop')

countryData <- arrange(countryData, desc(countryData$Mean.Life.Expectancy))

fig <- plot_ly(countryData, x = ~Country, y = ~Mean.Life.Expectancy, type = 'bar')

fig

9. Regresor

library(caret)

setConsistency()
inTraining <-
    createDataPartition(
        y = numericData$Life.Expectancy,
        p = .75,
        list = FALSE,)

trainingSet <- numericData[inTraining,]
testingSet <- numericData[-inTraining,]

rfGrid <- expand.grid(mtry = seq(2, ncol(numericData) - 1))
gridCtrl <- trainControl(
    method = "repeatedcv",
    number = 2,
    repeats = 5)

fitTune <- train(Life.Expectancy ~ .,
             data = trainingSet,
             method = "rf",
             metric = "RMSE",
             preProc = c("center", "scale"),
             trControl = gridCtrl,
             tuneGrid = rfGrid,
             ntree = 10)
fitTune
## Random Forest 
## 
## 1334 samples
##   19 predictor
## 
## Pre-processing: centered (19), scaled (19) 
## Resampling: Cross-Validated (2 fold, repeated 5 times) 
## Summary of sample sizes: 666, 668, 667, 667, 667, 667, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    2.865870  0.8981423  2.008806
##    3    2.611478  0.9138562  1.823620
##    4    2.496997  0.9209252  1.729448
##    5    2.414434  0.9260085  1.651294
##    6    2.334129  0.9300500  1.604995
##    7    2.349975  0.9291291  1.599798
##    8    2.281503  0.9329390  1.568389
##    9    2.303874  0.9315994  1.572533
##   10    2.300910  0.9315208  1.563833
##   11    2.288248  0.9320849  1.549882
##   12    2.274257  0.9330773  1.553570
##   13    2.288128  0.9322114  1.554789
##   14    2.282676  0.9322473  1.545455
##   15    2.297981  0.9313793  1.561153
##   16    2.286173  0.9322201  1.551356
##   17    2.328094  0.9294172  1.578184
##   18    2.300977  0.9314149  1.557778
##   19    2.324170  0.9297220  1.572661
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 12.
ggplot(fitTune) + theme_bw()

rfTuneClasses <- predict(fitTune,
                         newdata = testingSet)

postResample(
  as.numeric(rfTuneClasses),
  testingSet$Life.Expectancy)
##      RMSE  Rsquared       MAE 
## 1.9878760 0.9487156 1.2810903